We only make ground predictions using the PVLib clear-sky model and the statistical model, since the NSRDB model is not available for ground measurements.
Train/test on NSRDB data to find optimal parameters
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

# Pairwise feature distributions, colored by the NSRDB clear-sky label.
pplot = sns.pairplot(train.df[feature_cols + target_cols], hue='sky_status')

# Cluster the standardized features plus the measured and modeled GHI to see
# whether an unsupervised method recovers the clear/cloudy split.
X = train.df[feature_cols + ['GHI', 'Clearsky GHI pvlib']].values
X_std = StandardScaler().fit_transform(X)

# NOTE(review): the fitted estimator is DBSCAN, not KMeans -- renamed from the
# original, misleading `kmeans` variable.  The KMeans alternative is kept
# commented out for comparison.
clusterer = DBSCAN().fit(X_std)
# clusterer = KMeans(n_clusters=4, random_state=0).fit(X_std)

train.df['sky_status_km'] = clusterer.labels_
pd.unique(train.df['sky_status_km'])

# Treat cluster 0 as "clear".  The original expression tested `== 0` twice
# (`(x == 0) | (x == 0)`); the duplicate term was redundant and has been
# removed -- behavior is unchanged.  The redundant `= 0` initialization before
# assigning the labels was also dropped.
train.df['sky_status_km'] = train.df['sky_status_km'] == 0
# Overlay the NSRDB clear labels and the DBSCAN-derived labels on the measured
# and modeled GHI time series, then show the feature correlation matrix.
x = train.df.index
trace1 = go.Scatter(x=x, y=train.df['GHI'])
trace2 = go.Scatter(x=x, y=train.df['Clearsky GHI pvlib'])
# trace3 = go.Scatter(x=train.df[train.df['sky_status']].index,
#                     y=train.df[train.df['sky_status']]['GHI'], name='NSRDB', mode='markers')

# Named masks for the three disagreement/agreement cases.
nsrdb_only = train.df['sky_status'] & ~train.df['sky_status_km']
km_only = ~train.df['sky_status'] & train.df['sky_status_km']
both = train.df['sky_status'] & train.df['sky_status_km']

# Fixed legend typo: 'NSRBD' -> 'NSRDB' (the correct spelling is used in the
# later plots in this file).
trace3 = go.Scatter(x=train.df[nsrdb_only].index, y=train.df[nsrdb_only]['GHI'],
                    name='NSRDB', mode='markers')
trace4 = go.Scatter(x=train.df[km_only].index, y=train.df[km_only]['GHI'],
                    name='km', mode='markers')
trace5 = go.Scatter(x=train.df[both].index, y=train.df[both]['GHI'],
                    name='both', mode='markers')
iplot([trace1, trace2, trace3, trace4, trace5])

vis = visualize.Visualizer()
vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
verbose=0, warm_start=False)
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:
Large scaling value. Day will not be further assessed or scaled.
/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:
Large scaling value. Day will not be further assessed or scaled.
# Grid search over random-forest hyperparameters.  Train on 1999-2014 NSRDB
# data, score on the held-out 2014-2015 window.  Warnings are suppressed
# because the day-scaling utilities emit many RuntimeWarnings (see the dumps
# above).  NOTE(review): indentation was lost in the export -- the whole
# search is assumed to sit inside the catch_warnings() context; confirm
# against the original notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Hyperparameter grid.
    params = {
        'max_depth': [4, 8, 12, 16],
        'n_estimators': [64],
        'class_weight': [None, 'balanced'],
        'min_samples_leaf': [1, 2, 3],
    }

    results = []
    grid = itertools.product(params['max_depth'], params['n_estimators'],
                             params['class_weight'], params['min_samples_leaf'])
    for depth, nest, cw, min_samples in grid:
        print('Params:')
        print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples))

        # Fresh train/test splits of the NSRDB data for this parameter set.
        train2 = cs_detection.ClearskyDetection(train.df)
        train2.trim_dates('01-01-1999', '01-01-2014')
        utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI',
                                      model_col='Clearsky GHI pvlib', overwrite=True)
        test2 = cs_detection.ClearskyDetection(train.df)
        test2.trim_dates('01-01-2014', '01-01-2015')

        clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest,
                                              class_weight=cw,
                                              min_samples_leaf=min_samples,
                                              n_jobs=-1, random_state=42)
        # Fit on daytime points only (GHI > 0).
        daytime = train2.df['GHI'] > 0
        clf.fit(train2.df[daytime][feature_cols].values,
                train2.df[daytime][target_cols].values.flatten())

        print('\t Scores:')
        test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib',
                                             clf, 3, multiproc=True, by_day=True)
        acc = metrics.accuracy_score(test2.df['sky_status'], test_pred)
        print('\t\t accuracy: {}'.format(acc))
        f1 = metrics.f1_score(test2.df['sky_status'], test_pred)
        print('\t\t f1:{}'.format(f1))
        rec = metrics.recall_score(test2.df['sky_status'], test_pred)
        print('\t\t recall:{}'.format(rec))
        prec = metrics.precision_score(test2.df['sky_status'], test_pred)
        print('\t\t precision:{}'.format(prec))

        results.append({'max_depth': depth, 'n_estimators': nest,
                        'class_weight': cw, 'min_samples_leaf': min_samples,
                        'accuracy': acc, 'f1': f1, 'recall': rec,
                        'precision': prec})

runs_df = pd.DataFrame(results)
runs_df.to_csv('8_srrl_directional_features.csv')
Out[19]:
|
Unnamed: 0 |
accuracy |
class_weight |
f1 |
max_depth |
min_samples_leaf |
n_estimators |
precision |
recall |
| 0 |
0 |
0.529737 |
NaN |
0.324617 |
4 |
1 |
64 |
0.207656 |
0.743243 |
| 1 |
1 |
0.529737 |
NaN |
0.324617 |
4 |
2 |
64 |
0.207656 |
0.743243 |
| 2 |
2 |
0.529737 |
NaN |
0.324617 |
4 |
3 |
64 |
0.207656 |
0.743243 |
| 3 |
3 |
0.464212 |
balanced |
0.348623 |
4 |
1 |
64 |
0.213842 |
0.942943 |
| 4 |
4 |
0.464212 |
balanced |
0.348623 |
4 |
2 |
64 |
0.213842 |
0.942943 |
| 5 |
5 |
0.464212 |
balanced |
0.348623 |
4 |
3 |
64 |
0.213842 |
0.942943 |
| 6 |
6 |
0.944635 |
NaN |
0.810842 |
8 |
1 |
64 |
0.843750 |
0.780405 |
| 7 |
7 |
0.927968 |
NaN |
0.767673 |
8 |
2 |
64 |
0.753251 |
0.782658 |
| 8 |
8 |
0.926941 |
NaN |
0.765568 |
8 |
3 |
64 |
0.747496 |
0.784535 |
| 9 |
9 |
0.890297 |
balanced |
0.730435 |
8 |
1 |
64 |
0.583072 |
0.977477 |
| 10 |
10 |
0.891781 |
balanced |
0.733258 |
8 |
2 |
64 |
0.586409 |
0.978228 |
| 11 |
11 |
0.899600 |
balanced |
0.747814 |
8 |
3 |
64 |
0.604964 |
0.978979 |
| 12 |
12 |
0.952968 |
NaN |
0.840124 |
12 |
1 |
64 |
0.869478 |
0.812688 |
| 13 |
13 |
0.952568 |
NaN |
0.838295 |
12 |
2 |
64 |
0.870303 |
0.808559 |
| 14 |
14 |
0.952740 |
NaN |
0.839098 |
12 |
3 |
64 |
0.869863 |
0.810435 |
| 15 |
15 |
0.950342 |
balanced |
0.853239 |
12 |
1 |
64 |
0.774816 |
0.949324 |
| 16 |
16 |
0.951199 |
balanced |
0.855696 |
12 |
2 |
64 |
0.777369 |
0.951577 |
| 17 |
17 |
0.947260 |
balanced |
0.846051 |
12 |
3 |
64 |
0.760635 |
0.953078 |
| 18 |
18 |
0.955023 |
NaN |
0.848810 |
16 |
1 |
64 |
0.868132 |
0.830330 |
| 19 |
19 |
0.953196 |
NaN |
0.842731 |
16 |
2 |
64 |
0.861569 |
0.824700 |
| 20 |
20 |
0.954509 |
NaN |
0.845990 |
16 |
3 |
64 |
0.871764 |
0.821697 |
| 21 |
21 |
0.954167 |
balanced |
0.856119 |
16 |
1 |
64 |
0.818992 |
0.896772 |
| 22 |
22 |
0.954224 |
balanced |
0.858254 |
16 |
2 |
64 |
0.810955 |
0.911411 |
| 23 |
23 |
0.953653 |
balanced |
0.857042 |
16 |
3 |
64 |
0.807029 |
0.913664 |
Out[24]:
{'max_depth': 8, 'min_samples_leaf': 3, 'n_estimators': 64}
# Refit using the parameter set that maximized recall, then evaluate on the
# post-2015 NSRDB hold-out and plot agreement with the NSRDB labels.
clf = ensemble.RandomForestClassifier(**params_recall, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())

test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3,
                               multiproc=True, by_day=True).astype(bool)

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# Disagreement/agreement cases between the classifier and the NSRDB label.
nsrdb_clear = test.df['sky_status'] == 1
vis.add_circle_ser(test.df[~nsrdb_clear & pred]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[nsrdb_clear & ~pred]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[nsrdb_clear & pred]['GHI'], 'ML+NSRDB clear only')
vis.show()

cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
Out[30]:
{'class_weight': nan,
'max_depth': 16,
'min_samples_leaf': 1,
'n_estimators': 64}
# Same evaluation as above, but with the parameter set that maximized
# accuracy.
clf = ensemble.RandomForestClassifier(**params_accuracy, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())

test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3,
                               multiproc=True, by_day=True).astype(bool)

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# Disagreement/agreement cases between the classifier and the NSRDB label.
nsrdb_clear = test.df['sky_status'] == 1
vis.add_circle_ser(test.df[~nsrdb_clear & pred]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[nsrdb_clear & ~pred]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[nsrdb_clear & pred]['GHI'], 'ML+NSRDB clear only')
vis.show()

cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
This is the same model as the best-accuracy and best-precision model — see above.
Train on all NSRDB data, test various frequencies of ground data
/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[41]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=16, max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=3,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)
Out[46]:
Index(['GHI', 'Clearsky GHI pvlib', 'sky_status pvlib', 'ghi_status', 'scale'], dtype='object')
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:
Large scaling value. Day will not be further assessed or scaled.
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:
Scaling did not converge.
# ## 15 min freq ground data
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

# Downsample the ground data to 15-minute points and classify day by day with
# a 5-sample window.
test = cs_detection.ClearskyDetection(ground.df)
test.trim_dates('10-01-2015', '10-17-2015')
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
# NOTE(review): unlike the 10/5/1-minute cells below, this cell does not call
# test.scale_by_irrad('Clearsky GHI pvlib') -- confirm whether that is
# intentional.
test.df = test.df[test.df.index.minute % 15 == 0]
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5,
                               multiproc=True, by_day=True).astype(bool)

# Align NSRDB labels to the same 15-minute grid (missing points -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0],
                                            end=train2.df.index[-1], freq='15min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)

nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()

# Clear-sky probabilities for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

# ## 10 min freq ground data
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

# Downsample the ground data to 10-minute points and classify day by day with
# a 7-sample window.
test = cs_detection.ClearskyDetection(ground.df)
test.trim_dates('10-01-2015', '10-08-2015')
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
test.df = test.df[test.df.index.minute % 10 == 0]
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7,
                               multiproc=True, by_day=True).astype(bool)

# Align NSRDB labels to the same 10-minute grid (missing points -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0],
                                            end=train2.df.index[-1], freq='10min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)

nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()

# Clear-sky probabilities for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

# ## 5 min freq ground data
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

# Downsample the ground data to 5-minute points and classify day by day with
# a 13-sample window.
test = cs_detection.ClearskyDetection(ground.df)
test.trim_dates('10-01-2015', '10-04-2015')
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
test.df = test.df[test.df.index.minute % 5 == 0]
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13,
                               multiproc=True, by_day=True).astype(bool)

# Align NSRDB labels to the same 5-minute grid (missing points -> not clear).
# BUG FIX(review): the original trimmed train2 to '10-17-2015' while the test
# window above ends at '10-04-2015' (apparent copy-paste from the 15-minute
# cell), so the NSRDB label series covered a different date range than the
# ground predictions.  Both windows now match, as in the other frequency
# cells.
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-04-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0],
                                            end=train2.df.index[-1], freq='5min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)

nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()

# Clear-sky probabilities for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

# ## 1 min freq ground data
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

# Use the full 1-minute ground data and classify day by day with a 61-sample
# window (the `% 1 == 0` filter keeps every point).
test = cs_detection.ClearskyDetection(ground.df)
test.trim_dates('10-01-2015', '10-08-2015')
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
test.df = test.df[test.df.index.minute % 1 == 0]
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61,
                               multiproc=True, by_day=True).astype(bool)

# Align NSRDB labels to the same 1-minute grid (missing points -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0],
                                            end=train2.df.index[-1], freq='1min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)

nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']

vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()

# Clear-sky probabilities for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

# # Save model
import pickle
with open('8_abq_direction_features_model.pkl', 'wb') as f:
    pickle.dump(clf, f)
# !ls *abq*   (IPython shell magic from the original notebook)

# # Conclusion
# In general, the clear sky identification looks good.  At lower frequencies
# (30 min, 15 min) we see good agreement with NSRDB labeled points.  I suspect
# this could be further improved by doing a larger hyperparameter search, or
# even doing some feature extraction/reduction/additions.